# Import libraries
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from sentiment import remove_punctuation, word_lemmatizer, sentiment
# Import White House data
# NOTE: the path is absolute and machine-specific; the workbook must provide
# the 'Date', 'Title' and 'Content' columns used below.
WhiteHouse = pd.read_excel('D:\\Dropbox\\Research\\China Foreign Share Discount\\White House\\WhiteHouse.xlsx', parse_dates=['Date'])
# Combine title and content.
# A space separator keeps the last word of the title from fusing with the
# first word of the content (e.g. "...China" + "The ..." -> "ChinaThe"), which
# would otherwise produce a bogus token once the text is tokenized.
# Missing cells are treated as empty text so that one absent field does not
# turn the whole combined text into NaN.
WhiteHouse['Text'] = WhiteHouse['Title'].fillna('') + ' ' + WhiteHouse['Content'].fillna('')
# Keep Chinese related statements (case-sensitive match on the listed spellings).
mentions_china = WhiteHouse['Text'].str.contains('China|Chinese|CHINA|CHINESE', regex=True)
# .copy() makes the subset an independent DataFrame, so later column
# assignments do not raise SettingWithCopyWarning.
WhiteHouse_china = WhiteHouse.loc[mentions_china].copy()
# tokenizer: split the lower-cased text into runs of word characters (\w+),
# which discards punctuation and whitespace.
tokenizer = RegexpTokenizer(r'\w+')
# Work on an explicit copy: WhiteHouse_china may be a slice of WhiteHouse, and
# assigning a column on such a slice triggers pandas' SettingWithCopyWarning.
WhiteHouse_china = WhiteHouse_china.copy()
WhiteHouse_china['Text'] = WhiteHouse_china['Text'].apply(lambda text: tokenizer.tokenize(text.lower()))
# Lemmatize the token lists with word_lemmatizer from the local `sentiment`
# module.
# NOTE(review): the original comment here said "Remove stopwords"; whether
# word_lemmatizer also drops stopwords is not visible from this file - confirm.
WhiteHouse_china['Text'] = WhiteHouse_china['Text'].apply(word_lemmatizer)
# sentiment
# sentiment() is called once per processed statement and is unpacked into five
# values, stored in this order as the columns below.
# NOTE(review): the '_lm' suffix presumably refers to a second word list
# (e.g. Loughran-McDonald) - confirm against the `sentiment` module.
score_columns = ['pos', 'neg', 'pos_lm', 'neg_lm', 'wordcount']
score_rows = [sentiment(statement) for statement in WhiteHouse_china['Text']]
# Build the scores as their own frame on the same index instead of assigning
# five columns onto WhiteHouse_china, which may be a slice of WhiteHouse and
# would raise SettingWithCopyWarning.
scores = pd.DataFrame(score_rows, columns=score_columns, index=WhiteHouse_china.index)
# Keep only the date and the scores; the index-aligned join preserves row order.
WhiteHouse_china = WhiteHouse_china[['Date']].join(scores)
# The row index is written as the first CSV column (pandas default).
WhiteHouse_china.to_csv("sentiment_WhiteHouse.csv")